library(lubridate)
library(ggplot2)
library(tidyverse)
library(data.table)
library(lubridate)
library(stringr)
library(ggplot2)
library(plotly)
features_data_set <- read.csv2("Features data set.csv", header = TRUE, sep = ",")
sales_data_set <- read.csv2("sales data-set.csv", header = TRUE, sep = ",")
stores_data_set <- read.csv2("stores data-set.csv", header = TRUE, sep = ",")
str(features_data_set)
## 'data.frame': 8190 obs. of 12 variables:
## $ Store : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Date : Factor w/ 182 levels "01/02/2013","01/03/2013",..: 25 67 109 151 26 68 110 152 8 50 ...
## $ Temperature : Factor w/ 4178 levels "-2.06","-6.08",..: 1066 864 934 1308 1300 1998 1797 1601 2315 2579 ...
## $ Fuel_Price : Factor w/ 1011 levels "2.472","2.513",..: 16 10 3 12 43 58 88 95 87 121 ...
## $ MarkDown1 : Factor w/ 4023 levels "-16.93","-2781.45",..: NA NA NA NA NA NA NA NA NA NA ...
## $ MarkDown2 : Factor w/ 2715 levels "-0.01","-0.05",..: NA NA NA NA NA NA NA NA NA NA ...
## $ MarkDown3 : Factor w/ 2885 levels "-0.2","-0.73",..: NA NA NA NA NA NA NA NA NA NA ...
## $ MarkDown4 : Factor w/ 3405 levels "0.22","0.41",..: NA NA NA NA NA NA NA NA NA NA ...
## $ MarkDown5 : Factor w/ 4045 levels "-185.17","-37.02",..: NA NA NA NA NA NA NA NA NA NA ...
## $ CPI : Factor w/ 2505 levels "126.064","126.0766452",..: 1124 1143 1148 1150 1154 1157 1139 1115 1096 1084 ...
## $ Unemployment: Factor w/ 404 levels "10.064","10.115",..: 286 286 286 286 286 286 286 286 252 252 ...
## $ IsHoliday : logi FALSE TRUE FALSE FALSE FALSE FALSE ...
str(sales_data_set)
## 'data.frame': 421570 obs. of 5 variables:
## $ Store : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Dept : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Date : Factor w/ 143 levels "01/04/2011","01/06/2012",..: 20 53 86 119 21 54 87 120 6 39 ...
## $ Weekly_Sales: Factor w/ 359464 levels "-0.02","-0.04",..: 140272 237401 220814 101950 120302 114885 122532 148007 272542 225936 ...
## $ IsHoliday : logi FALSE TRUE FALSE FALSE FALSE FALSE ...
str(stores_data_set)
## 'data.frame': 45 obs. of 3 variables:
## $ Store: int 1 2 3 4 5 6 7 8 9 10 ...
## $ Type : Factor w/ 3 levels "A","B","C": 1 1 2 1 2 1 2 1 2 2 ...
## $ Size : int 151315 202307 37392 205863 34875 202505 70713 155078 125833 126512 ...
features_data_set$Year <- substr(features_data_set$Date, 7, 10)
features_data_set$Month <- substr(features_data_set$Date, 4, 5)
features_data_set$Day <- substr(features_data_set$Date, 1, 2)
sales_data_set$Year <- substr(sales_data_set$Date, 7, 10)
sales_data_set$Month <- substr(sales_data_set$Date, 4, 5)
sales_data_set$Day <- substr(sales_data_set$Date, 1, 2)
sales_data_set$Weekly_Sales <- as.character(sales_data_set$Weekly_Sales)
sales_data_set$Weekly_Sales <- as.numeric(sales_data_set$Weekly_Sales,2)
ggplot(stores_data_set, aes(Type, fill = Type ) ) +
geom_bar() +
xlab("Type of Store") + ylab("Count of Store")
YearSales <- sales_data_set %>% group_by(Year) %>% summarise(YearSales = sum(Weekly_Sales))
ggplot(YearSales, aes(Year, YearSales)) +
geom_col()
SalesStore <- left_join(sales_data_set, stores_data_set, by = "Store")
ggplot(SalesStore, aes(Type, Size) ,log = "xy") +
geom_point()
plot(SalesStore$Size,SalesStore$Weekly_Sales, main = "Size vs Sales", xlab = "Store Size", ylab = "Weekly Sales")
Looking at the percentages of the montly sales by the stores
Sales<-data.table(sales_data_set)
Features<-data.table(features_data_set)
Stores<-data.table(stores_data_set)
Sales<-Sales[,list(Store,Dept,Date,Weekly_Sales)]
setkey(Sales,Store,Date)
setkey(Features,Store,Date)
Sales<-Features[Sales]
setkey(Sales,Store)
setkey(Stores,Store)
Sales<-Stores[Sales]
str(Sales)
## Classes 'data.table' and 'data.frame': 421570 obs. of 19 variables:
## $ Store : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Type : Factor w/ 3 levels "A","B","C": 1 1 1 1 1 1 1 1 1 1 ...
## $ Size : int 151315 151315 151315 151315 151315 151315 151315 151315 151315 151315 ...
## $ Date : Factor w/ 143 levels "01/04/2011","01/06/2012",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ Temperature : Factor w/ 4178 levels "-2.06","-6.08",..: 2096 2096 2096 2096 2096 2096 2096 2096 2096 2096 ...
## $ Fuel_Price : Factor w/ 1011 levels "2.472","2.513",..: 558 558 558 558 558 558 558 558 558 558 ...
## $ MarkDown1 : Factor w/ 4023 levels "-16.93","-2781.45",..: NA NA NA NA NA NA NA NA NA NA ...
## $ MarkDown2 : Factor w/ 2715 levels "-0.01","-0.05",..: NA NA NA NA NA NA NA NA NA NA ...
## $ MarkDown3 : Factor w/ 2885 levels "-0.2","-0.73",..: NA NA NA NA NA NA NA NA NA NA ...
## $ MarkDown4 : Factor w/ 3405 levels "0.22","0.41",..: NA NA NA NA NA NA NA NA NA NA ...
## $ MarkDown5 : Factor w/ 4045 levels "-185.17","-37.02",..: NA NA NA NA NA NA NA NA NA NA ...
## $ CPI : Factor w/ 2505 levels "126.064","126.0766452",..: 1524 1524 1524 1524 1524 1524 1524 1524 1524 1524 ...
## $ Unemployment: Factor w/ 404 levels "10.064","10.115",..: 238 238 238 238 238 238 238 238 238 238 ...
## $ IsHoliday : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ Year : chr "2011" "2011" "2011" "2011" ...
## $ Month : chr "04" "04" "04" "04" ...
## $ Day : chr "01" "01" "01" "01" ...
## $ Dept : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Weekly_Sales: num 20398 46992 8734 34452 23599 ...
## - attr(*, "sorted")= chr "Store"
## - attr(*, ".internal.selfref")=<externalptr>
summary(Sales)
## Store Type Size Date
## Min. : 1.0 A:215478 Min. : 34875 23/12/2011: 3027
## 1st Qu.:11.0 B:163495 1st Qu.: 93638 25/11/2011: 3021
## Median :22.0 C: 42597 Median :140167 16/12/2011: 3013
## Mean :22.2 Mean :136728 09/12/2011: 3010
## 3rd Qu.:33.0 3rd Qu.:202505 17/02/2012: 3007
## Max. :45.0 Max. :219622 30/12/2011: 3003
## (Other) :403489
## Temperature Fuel_Price MarkDown1 MarkDown2
## 50.43 : 709 3.638 : 2548 1.5 : 102 1.91 : 539
## 67.87 : 646 3.63 : 2164 460.73 : 102 3 : 493
## 72.62 : 594 2.771 : 1917 175.64 : 93 0.5 : 485
## 76.67 : 583 3.891 : 1856 1282.42: 75 1.5 : 471
## 70.28 : 563 3.594 : 1796 1483.17: 75 4 : 367
## 76.03 : 555 3.524 : 1793 (Other):150234 (Other):108893
## (Other):417920 (Other):409496 NA's :270889 NA's :310322
## MarkDown3 MarkDown4 MarkDown5 CPI
## 3 : 754 9 : 280 2743.18 : 136 129.8555333: 711
## 6 : 710 4 : 200 1064.56 : 120 131.1083333: 708
## 2 : 660 2 : 197 20371.02: 75 129.8459667: 707
## 1 : 611 3 : 146 3557.67 : 75 130.3849032: 706
## 0.22 : 487 47 : 143 3567.03 : 75 130.6457931: 706
## (Other):133869 (Other):134001 (Other) :150951 130.683 : 706
## NA's :284479 NA's :286603 NA's :270138 (Other) :417326
## Unemployment IsHoliday Year Month
## 8.099 : 5152 Mode :logical Length:421570 Length:421570
## 8.163 : 3636 FALSE:391909 Class :character Class :character
## 7.852 : 3614 TRUE :29661 Mode :character Mode :character
## 7.343 : 3416
## 7.057 : 3414
## 7.931 : 3400
## (Other):398938
## Day Dept Weekly_Sales
## Length:421570 Min. : 1.00 Min. : -4989
## Class :character 1st Qu.:18.00 1st Qu.: 2080
## Mode :character Median :37.00 Median : 7612
## Mean :44.26 Mean : 15981
## 3rd Qu.:74.00 3rd Qu.: 20206
## Max. :99.00 Max. :693099
##
Sales[,Month:=as.numeric(substring(as.character(Date),4,5))]
Sales$Date<-dmy(Sales$Date)
MonthlySales<-Sales[,sum(Weekly_Sales,na.rm = TRUE),.(Store,Month)]
setnames(MonthlySales,"V1","Monthly_Sales")
MonthlySales[,TotalSales:=sum(Monthly_Sales,na.rm = TRUE),.(Month)]
MonthlySales[,SalesPercantage:=Monthly_Sales*1.0/TotalSales]
Clusno<-5
CM=dcast.data.table(MonthlySales,Store~Month,value.var="SalesPercantage")
S<-colnames(CM)
CM<-data.frame(CM)
CM[is.na(CM)]=0
colnames(CM)<-S
CM<-data.table(CM)
# basl<-which(colnames(rr)=="2")
# bitis<-which(colnames(rr)=="269")
set.seed(7)
CM[,clusno:=kmeans(CM[,c(2:ncol(CM)),with=F],Clusno)$cluster]
clusters<-CM[,list(Store,clusno)]
setkey(clusters,Store)
setkey(MonthlySales,Store)
MonthlySales<-clusters[MonthlySales]
SalesP<-dcast.data.table(MonthlySales,Month~Store,value.var="SalesPercantage")
MonthlySales$Month <- factor(MonthlySales$Month)
MonthlySales$Store <- factor(MonthlySales$Store)
MonthlySales$clusno <- factor(MonthlySales$clusno)
# plotting reference lines across each facet:
referenceLines <- MonthlySales # \/ Rename
colnames(referenceLines)[2] <- "groupVar"
zp <- ggplot(MonthlySales,
aes(x = Month, y = SalesPercantage))
zp <- zp + geom_line(data = referenceLines, # Plotting the "underlayer"
aes(x = Month, y = SalesPercantage, group = groupVar),
colour = "GRAY", alpha = 1/2, size = 1/2)
zp <- zp + geom_line(size = 1) # Drawing the "overlayer"
zp <- zp + facet_wrap(~ Store)
zp <- zp + theme_bw()
ggplotly()
ggplot(MonthlySales, aes(x=Month, y=SalesPercantage, color=clusno, group=Store)) +
geom_line()
ggplotly()
###References #_Retail Data Analytics. (2017, August). Retrieved from https://www.kaggle.com/manjeetsingh/retaildataset_